[SLP]Initial FMAD support #149102


Merged: 7 commits, Aug 7, 2025

Conversation

alexey-bataev (Member) commented Jul 16, 2025

Added initial check for potential fmad conversion in reductions and
operand vectorization.
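
A minimal sketch of the pattern being checked, in hypothetical IR (not taken from the patch's tests): a contract-enabled fmul feeding a contract-enabled fadd, which the new check can cost as a single llvm.fmuladd call:

define float @mul_add(float %a, float %b, float %c) {
  ; Both instructions allow contraction, so the pair is an FMA candidate.
  %mul = fmul contract float %a, %b
  %add = fadd contract float %mul, %c
  ret float %add
}

; When profitable, the pair is costed as if it were:
;   %fma = call contract float @llvm.fmuladd.f32(float %a, float %b, float %c)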

llvmbot (Member) commented Jul 16, 2025

@llvm/pr-subscribers-backend-risc-v
@llvm/pr-subscribers-vectorizers

@llvm/pr-subscribers-llvm-transforms

Author: Alexey Bataev (alexey-bataev)

Changes

Added initial check for potential fma conversion in reductions and
operand vectorization.


Patch is 89.41 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/149102.diff

11 Files Affected:

  • (modified) llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp (+160-4)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll (+20-14)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll (+107-38)
  • (modified) llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll (+39-17)
  • (modified) llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll (+64-84)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll (+128-33)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll (+59-11)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll (+19-3)
  • (modified) llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll (+49-28)
  • (modified) llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll (+51-26)
  • (modified) llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll (+39-18)
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 1b8a80d2b3c94..c94f49f941005 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -3759,6 +3759,7 @@ class BoUpSLP {
     enum CombinedOpcode {
       NotCombinedOp = -1,
       MinMax = Instruction::OtherOpsEnd + 1,
+      FMulAdd,
     };
     CombinedOpcode CombinedOp = NotCombinedOp;
 
@@ -3896,6 +3897,9 @@ class BoUpSLP {
 
     bool hasState() const { return S.valid(); }
 
+    /// Returns the state of the operations.
+    const InstructionsState &getOperations() const { return S; }
+
     /// When ReuseReorderShuffleIndices is empty it just returns position of \p
     /// V within vector of Scalars. Otherwise, try to remap on its reuse index.
     unsigned findLaneForValue(Value *V) const {
@@ -11563,6 +11567,84 @@ void BoUpSLP::reorderGatherNode(TreeEntry &TE) {
   }
 }
 
+static InstructionCost canConvertToFMA(ArrayRef<Value *> VL,
+                                       const InstructionsState &S,
+                                       DominatorTree &DT, const DataLayout &DL,
+                                       TargetTransformInfo &TTI,
+                                       const TargetLibraryInfo &TLI) {
+  assert(all_of(VL,
+                [](Value *V) {
+                  return V->getType()->getScalarType()->isFloatingPointTy();
+                }) &&
+         "Can only convert to FMA for floating point types");
+  assert(S.isAddSubLikeOp() && "Can only convert to FMA for add/sub");
+
+  auto CheckForContractable = [&](ArrayRef<Value *> VL) {
+    FastMathFlags FMF;
+    FMF.set();
+    for (Value *V : VL) {
+      auto *I = dyn_cast<Instruction>(V);
+      if (!I)
+        continue;
+      // TODO: support for copyable elements.
+      Instruction *MatchingI = S.getMatchingMainOpOrAltOp(I);
+      if (S.getMainOp() != MatchingI && S.getAltOp() != MatchingI)
+        continue;
+      if (auto *FPCI = dyn_cast<FPMathOperator>(I))
+        FMF &= FPCI->getFastMathFlags();
+    }
+    return FMF.allowContract();
+  };
+  if (!CheckForContractable(VL))
+    return InstructionCost::getInvalid();
+  // fmul also should be contractable
+  InstructionsCompatibilityAnalysis Analysis(DT, DL, TTI, TLI);
+  SmallVector<BoUpSLP::ValueList> Operands = Analysis.buildOperands(S, VL);
+
+  InstructionsState OpS = getSameOpcode(Operands.front(), TLI);
+  if (!OpS.valid())
+    return InstructionCost::getInvalid();
+  if (OpS.isAltShuffle() || OpS.getOpcode() != Instruction::FMul)
+    return InstructionCost::getInvalid();
+  if (!CheckForContractable(Operands.front()))
+    return InstructionCost::getInvalid();
+  // Compare the costs.
+  InstructionCost FMulPlusFaddCost = 0;
+  InstructionCost FMACost = 0;
+  constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  FastMathFlags FMF;
+  FMF.set();
+  for (Value *V : VL) {
+    auto *I = dyn_cast<Instruction>(V);
+    if (!I)
+      continue;
+    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
+      FMF &= FPCI->getFastMathFlags();
+    FMulPlusFaddCost += TTI.getInstructionCost(I, CostKind);
+  }
+  for (auto [V, Op] : zip(VL, Operands.front())) {
+    auto *I = dyn_cast<Instruction>(Op);
+    if (!I || !I->hasOneUse()) {
+      FMACost += TTI.getInstructionCost(cast<Instruction>(V), CostKind);
+      if (I)
+        FMACost += TTI.getInstructionCost(I, CostKind);
+      continue;
+    }
+    if (auto *FPCI = dyn_cast<FPMathOperator>(I))
+      FMF &= FPCI->getFastMathFlags();
+    FMulPlusFaddCost += TTI.getInstructionCost(I, CostKind);
+  }
+  const unsigned NumOps =
+      count_if(zip(VL, Operands.front()), [](const auto &P) {
+        return isa<Instruction>(std::get<0>(P)) &&
+               isa<Instruction>(std::get<1>(P));
+      });
+  Type *Ty = VL.front()->getType();
+  IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
+  FMACost += NumOps * TTI.getIntrinsicInstrCost(ICA, CostKind);
+  return FMACost < FMulPlusFaddCost ? FMACost : InstructionCost::getInvalid();
+}
+
 void BoUpSLP::transformNodes() {
   constexpr TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   BaseGraphSize = VectorizableTree.size();
@@ -11922,6 +12004,25 @@ void BoUpSLP::transformNodes() {
       }
       break;
     }
+    case Instruction::FSub:
+    case Instruction::FAdd: {
+      // Check if possible to convert (a*b)+c to fma.
+      if (E.State != TreeEntry::Vectorize ||
+          !E.getOperations().isAddSubLikeOp())
+        break;
+      if (!canConvertToFMA(E.Scalars, E.getOperations(), *DT, *DL, *TTI, *TLI)
+               .isValid())
+        break;
+      // This node is a fmuladd node.
+      E.CombinedOp = TreeEntry::FMulAdd;
+      TreeEntry *FMulEntry = getOperandEntry(&E, 0);
+      if (FMulEntry->UserTreeIndex &&
+          FMulEntry->State == TreeEntry::Vectorize) {
+        // The FMul node is part of the combined fmuladd node.
+        FMulEntry->State = TreeEntry::CombinedVectorize;
+      }
+      break;
+    }
     default:
       break;
     }
@@ -13152,6 +13253,12 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     }
     return IntrinsicCost;
   };
+  auto GetFMulAddCost = [&, &TTI = *TTI](const InstructionsState &S,
+                                         Instruction *VI = nullptr) {
+    InstructionCost Cost = canConvertToFMA(VI ? ArrayRef<Value *>(VI) : VL, S,
+                                           *DT, *DL, TTI, *TLI);
+    return Cost;
+  };
   switch (ShuffleOrOp) {
   case Instruction::PHI: {
     // Count reused scalars.
@@ -13492,6 +13599,30 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
     };
     return GetCostDiff(GetScalarCost, GetVectorCost);
   }
+  case TreeEntry::FMulAdd: {
+    auto GetScalarCost = [&](unsigned Idx) {
+      if (isa<PoisonValue>(UniqueValues[Idx]))
+        return InstructionCost(TTI::TCC_Free);
+      return GetFMulAddCost(E->getOperations(),
+                            cast<Instruction>(UniqueValues[Idx]));
+    };
+    auto GetVectorCost = [&, &TTI = *TTI](InstructionCost CommonCost) {
+      FastMathFlags FMF;
+      FMF.set();
+      for (Value *V : E->Scalars) {
+        if (auto *FPCI = dyn_cast<FPMathOperator>(V)) {
+          FMF &= FPCI->getFastMathFlags();
+          if (auto *FPCIOp = dyn_cast<FPMathOperator>(FPCI->getOperand(0)))
+            FMF &= FPCIOp->getFastMathFlags();
+        }
+      }
+      IntrinsicCostAttributes ICA(Intrinsic::fmuladd, VecTy,
+                                  {VecTy, VecTy, VecTy}, FMF);
+      InstructionCost VecCost = TTI.getIntrinsicInstrCost(ICA, CostKind);
+      return VecCost + CommonCost;
+    };
+    return GetCostDiff(GetScalarCost, GetVectorCost);
+  }
   case Instruction::FNeg:
   case Instruction::Add:
   case Instruction::FAdd:
@@ -13529,8 +13660,16 @@ BoUpSLP::getEntryCost(const TreeEntry *E, ArrayRef<Value *> VectorizedVals,
       }
       TTI::OperandValueInfo Op1Info = TTI::getOperandInfo(Op1);
       TTI::OperandValueInfo Op2Info = TTI::getOperandInfo(Op2);
-      return TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
+      InstructionCost ScalarCost = TTI->getArithmeticInstrCost(ShuffleOrOp, OrigScalarTy, CostKind,
                                          Op1Info, Op2Info, Operands);
+      if (auto *I = dyn_cast<Instruction>(UniqueValues[Idx]);
+          I && (ShuffleOrOp == Instruction::FAdd ||
+                ShuffleOrOp == Instruction::FSub)) {
+        InstructionCost IntrinsicCost = GetFMulAddCost(E->getOperations(), I);
+        if (IntrinsicCost.isValid())
+          ScalarCost = IntrinsicCost;
+      }
+      return ScalarCost;
     };
     auto GetVectorCost = [=](InstructionCost CommonCost) {
       if (ShuffleOrOp == Instruction::And && It != MinBWs.end()) {
@@ -22052,11 +22191,21 @@ class HorizontalReduction {
   /// Try to find a reduction tree.
   bool matchAssociativeReduction(BoUpSLP &R, Instruction *Root,
                                  ScalarEvolution &SE, const DataLayout &DL,
-                                 const TargetLibraryInfo &TLI) {
+                                 const TargetLibraryInfo &TLI,
+                                 DominatorTree &DT, TargetTransformInfo &TTI) {
     RdxKind = HorizontalReduction::getRdxKind(Root);
     if (!isVectorizable(RdxKind, Root))
       return false;
 
+    // FMA reduction root - skip.
+    auto CheckForFMA = [&](Instruction *I) {
+      return RdxKind == RecurKind::FAdd &&
+             canConvertToFMA(I, getSameOpcode(I, TLI), DT, DL, TTI, TLI)
+                 .isValid();
+    };
+    if (CheckForFMA(Root))
+      return false;
+
     // Analyze "regular" integer/FP types for reductions - no target-specific
     // types or pointers.
     Type *Ty = Root->getType();
@@ -22095,6 +22244,7 @@ class HorizontalReduction {
         // foldable.
         if (!EdgeInst || Level > RecursionMaxDepth ||
             getRdxKind(EdgeInst) != RdxKind ||
+            CheckForFMA(EdgeInst) ||
             IsCmpSelMinMax != isCmpSelMinMax(EdgeInst) ||
             !hasRequiredNumberOfUses(IsCmpSelMinMax, EdgeInst) ||
             !isVectorizable(RdxKind, EdgeInst) ||
@@ -23658,13 +23808,13 @@ bool SLPVectorizerPass::vectorizeHorReduction(
   Stack.emplace(SelectRoot(), 0);
   SmallPtrSet<Value *, 8> VisitedInstrs;
   bool Res = false;
-  auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * {
+  auto TryToReduce = [this, &R, TTI = TTI](Instruction *Inst) -> Value * {
     if (R.isAnalyzedReductionRoot(Inst))
       return nullptr;
     if (!isReductionCandidate(Inst))
       return nullptr;
     HorizontalReduction HorRdx;
-    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI))
+    if (!HorRdx.matchAssociativeReduction(R, Inst, *SE, *DL, *TLI, *DT, *TTI))
       return nullptr;
     return HorRdx.tryToReduce(R, *DL, TTI, *TLI, AC);
   };
@@ -23730,6 +23880,12 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
 
   if (!isa<BinaryOperator, CmpInst>(I) || isa<VectorType>(I->getType()))
     return false;
+  // Skip potential FMA candidates.
+  if ((I->getOpcode() == Instruction::FAdd ||
+       I->getOpcode() == Instruction::FSub) &&
+      canConvertToFMA(I, getSameOpcode(I, *TLI), *DT, *DL, *TTI, *TLI)
+          .isValid())
+    return false;
 
   Value *P = I->getParent();
 
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
index 442769937ac12..9e086dcad686c 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll
@@ -8,15 +8,18 @@ target triple = "aarch64--linux-gnu"
 define void @test1(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-LABEL: @test1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH:%.*]]
 ; CHECK:       for.body3.lr.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT:    [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
+; CHECK-NEXT:    [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]]
+; CHECK-NEXT:    [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]]
+; CHECK-NEXT:    [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL11]], [[MUL12]]
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK:       for.end27:
@@ -47,15 +50,18 @@ for.end27:
 define void @test2(ptr nocapture readonly %J, i32 %xmin, i32 %ymin) {
 ; CHECK-LABEL: @test2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = insertelement <2 x i32> poison, i32 [[XMIN:%.*]], i32 0
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> [[TMP0]], i32 [[YMIN:%.*]], i32 1
 ; CHECK-NEXT:    br label [[FOR_BODY3_LR_PH:%.*]]
 ; CHECK:       for.body3.lr.ph:
-; CHECK-NEXT:    [[TMP2:%.*]] = sitofp <2 x i32> [[TMP1]] to <2 x float>
-; CHECK-NEXT:    [[TMP4:%.*]] = load <2 x float>, ptr [[J:%.*]], align 4
-; CHECK-NEXT:    [[TMP5:%.*]] = fsub fast <2 x float> [[TMP2]], [[TMP4]]
-; CHECK-NEXT:    [[TMP6:%.*]] = fmul fast <2 x float> [[TMP5]], [[TMP5]]
-; CHECK-NEXT:    [[ADD:%.*]] = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> [[TMP6]])
+; CHECK-NEXT:    [[CONV5:%.*]] = sitofp i32 [[YMIN:%.*]] to float
+; CHECK-NEXT:    [[CONV:%.*]] = sitofp i32 [[XMIN:%.*]] to float
+; CHECK-NEXT:    [[TMP0:%.*]] = load float, ptr [[J:%.*]], align 4
+; CHECK-NEXT:    [[SUB:%.*]] = fsub fast float [[CONV]], [[TMP0]]
+; CHECK-NEXT:    [[ARRAYIDX9:%.*]] = getelementptr inbounds [[STRUCTA:%.*]], ptr [[J]], i64 0, i32 0, i64 1
+; CHECK-NEXT:    [[TMP1:%.*]] = load float, ptr [[ARRAYIDX9]], align 4
+; CHECK-NEXT:    [[SUB10:%.*]] = fsub fast float [[CONV5]], [[TMP1]]
+; CHECK-NEXT:    [[MUL11:%.*]] = fmul fast float [[SUB]], [[SUB]]
+; CHECK-NEXT:    [[MUL12:%.*]] = fmul fast float [[SUB10]], [[SUB10]]
+; CHECK-NEXT:    [[ADD:%.*]] = fadd fast float [[MUL12]], [[MUL11]]
 ; CHECK-NEXT:    [[CMP:%.*]] = fcmp oeq float [[ADD]], 0.000000e+00
 ; CHECK-NEXT:    br i1 [[CMP]], label [[FOR_BODY3_LR_PH]], label [[FOR_END27:%.*]]
 ; CHECK:       for.end27:
diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
index 295a71899c338..2e684320ba10e 100644
--- a/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
+++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll
@@ -12,7 +12,8 @@ define void @test() {
 ; CHECK:       [[BB63]]:
 ; CHECK-NEXT:    br label %[[BB64]]
 ; CHECK:       [[BB64]]:
-; CHECK-NEXT:    [[TMP25:%.*]] = phi <16 x float> [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
+; CHECK-NEXT:    [[I65:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
+; CHECK-NEXT:    [[I77:%.*]] = phi nsz float [ poison, %[[BB61]] ], [ poison, %[[BB63]] ], [ poison, %[[BB62]] ]
 ; CHECK-NEXT:    [[I66:%.*]] = load float, ptr poison, align 16
 ; CHECK-NEXT:    [[I67:%.*]] = load float, ptr poison, align 4
 ; CHECK-NEXT:    [[I68:%.*]] = load float, ptr poison, align 8
@@ -24,57 +25,125 @@ define void @test() {
 ; CHECK-NEXT:    [[I74:%.*]] = load float, ptr poison, align 4
 ; CHECK-NEXT:    [[I75:%.*]] = load float, ptr poison, align 16
 ; CHECK-NEXT:    [[I76:%.*]] = load float, ptr poison, align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x float> poison, float [[I76]], i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x float> [[TMP1]], float [[I75]], i32 1
-; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <16 x float> [[TMP2]], float [[I74]], i32 2
-; CHECK-NEXT:    [[TMP4:%.*]] = insertelement <16 x float> [[TMP3]], float [[I73]], i32 3
-; CHECK-NEXT:    [[TMP5:%.*]] = insertelement <16 x float> [[TMP4]], float [[I71]], i32 4
-; CHECK-NEXT:    [[TMP6:%.*]] = insertelement <16 x float> [[TMP5]], float [[I70]], i32 5
-; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <16 x float> [[TMP6]], float [[I68]], i32 6
-; CHECK-NEXT:    [[TMP8:%.*]] = insertelement <16 x float> [[TMP7]], float [[I66]], i32 7
-; CHECK-NEXT:    [[TMP9:%.*]] = insertelement <16 x float> [[TMP8]], float [[I72]], i32 13
-; CHECK-NEXT:    [[TMP10:%.*]] = insertelement <16 x float> [[TMP9]], float [[I67]], i32 14
-; CHECK-NEXT:    [[TMP11:%.*]] = insertelement <16 x float> [[TMP10]], float [[I69]], i32 15
 ; CHECK-NEXT:    br i1 poison, label %[[BB167:.*]], label %[[BB77:.*]]
 ; CHECK:       [[BB77]]:
-; CHECK-NEXT:    [[TMP12:%.*]] = shufflevector <16 x float> [[TMP11]], <16 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 14, i32 15, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP17:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 0
-; CHECK-NEXT:    [[TMP23:%.*]] = shufflevector <8 x float> [[TMP12]], <8 x float> [[TMP17]], <8 x i32> <i32 8, i32 poison, i32 poison, i32 poison, i32 4, i32 5, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP14:%.*]] = insertelement <8 x float> poison, float [[I70]], i32 1
-; CHECK-NEXT:    [[TMP19:%.*]] = insertelement <8 x float> [[TMP14]], float [[I68]], i32 2
-; CHECK-NEXT:    [[TMP16:%.*]] = insertelement <8 x float> [[TMP19]], float [[I66]], i32 3
-; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <8 x float> [[TMP16]], float [[I67]], i32 6
-; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <8 x float> [[TMP20]], float [[I69]], i32 7
-; CHECK-NEXT:    [[TMP39:%.*]] = shufflevector <16 x float> [[TMP25]], <16 x float> poison, <16 x i32> <i32 poison, i32 poison, i32 3, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP13:%.*]] = shufflevector <16 x float> [[TMP39]], <16 x float> [[TMP25]], <16 x i32> <i32 poison, i32 poison, i32 2, i32 3, i32 18, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 19, i32 poison, i32 poison>
 ; CHECK-NEXT:    br label %[[BB78:.*]]
 ; CHECK:       [[BB78]]:
-; CHECK-NEXT:    [[TMP15:%.*]] = phi <8 x float> [ [[TMP23]], %[[BB77]] ], [ [[TMP36:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[TMP22:%.*]] = phi <8 x float> [ [[TMP21]], %[[BB77]] ], [ [[TMP31:%.*]], %[[BB78]] ]
-; CHECK-NEXT:    [[TMP24:%.*]] = shufflevector <8 x float> [[TMP22]], <8 x float> poison, <16 x i32> <i32 0, i32 3, i32 1, i32 2, i32 3, i32 0, i32 2, i32 3, i32 2, i32 6, i32 2, i32 3, i32 0, i32 7, i32 6, i32 6>
-; CHECK-NEXT:    [[TMP38:%.*]] = shufflevector <8 x float> [[TMP15]], <8 x float> poison, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 1, i32 0, i32 3, i32 1, i32 3, i32 5, i32 3, i32 1, i32 0, i32 4, i32 5, i32 5>
-; CHECK-NEXT:    [[TMP18:%.*]] = fmul fast <16 x float> [[TMP24]], [[TMP13]]
-; CHECK-NEXT:    [[TMP26:%.*]] = fmul fast <16 x float> [[TMP38]], [[TMP25]]
-; CHECK-NEXT:    [[TMP27:%.*]] = fadd fast <16 x float> [[TMP26]], [[TMP18]]
-; CHECK-NEXT:    [[TMP28:%.*]] = fadd fast <16 x float> [[TMP27]], poison
-; CHECK-NEXT:    [[TMP29:%.*]] = fadd fast <16 x float> [[TMP28]], poison
-; CHECK-NEXT:    [[TMP36]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 5, i32 11, i32 12, i32 10, i32 14, i32 15, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP31]] = shufflevector <16 x float> [[TMP29]], <16 x float> poison, <8 x i32> <i32 12, i32 5, i32 6, i32 7, i32 poison, i32 poison, i32 14, i32 15>
+; CHECK-NEXT:    [[I85:%.*]] = phi nsz float [ [[I66]], %[[BB77]] ], [ [[I103:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[I80:%.*]] = phi nsz float [ [[I67]], %[[BB77]] ], [ [[I104:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[I81:%.*]] = phi nsz float [ [[I68]], %[[BB77]] ], [ [[I105:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[I82:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I106:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[I84:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I123:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[I127:%.*]] = phi nsz float [ [[I69]], %[[BB77]] ], [ [[I124:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[I131:%.*]] = phi nsz float [ poison, %[[BB77]] ], [ [[I125:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[I86:%.*]] = phi nsz float [ [[I70]], %[[BB77]] ], [ [[I126:%.*]], %[[BB78]] ]
+; CHECK-NEXT:    [[I87:%.*]] = fmul fast float [[I85]], poison
+; CHECK-NEXT:    [[I88:%.*]] = fmul fast float [[I80]], poison
+; CHECK-NEXT:    [[I89:%...
[truncated]


github-actions bot commented Jul 16, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.


github-actions bot commented Jul 16, 2025

⚠️ undef deprecator found issues in your code. ⚠️

You can test this locally with the following command:
git diff -U0 --pickaxe-regex -S '([^a-zA-Z0-9#_-]undef[^a-zA-Z0-9_-]|UndefValue::get)' 'HEAD~1' HEAD llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp llvm/test/Transforms/SLPVectorizer/AArch64/commute.ll llvm/test/Transforms/SLPVectorizer/AArch64/reused-scalar-repeated-in-node.ll llvm/test/Transforms/SLPVectorizer/AArch64/scalarization-overhead.ll llvm/test/Transforms/SLPVectorizer/RISCV/vec3-base.ll llvm/test/Transforms/SLPVectorizer/X86/dot-product.ll llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll llvm/test/Transforms/SLPVectorizer/X86/redux-feed-buildvector.ll llvm/test/Transforms/SLPVectorizer/X86/redux-feed-insertelement.ll llvm/test/Transforms/SLPVectorizer/X86/slp-fma-loss.ll llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll llvm/test/Transforms/SLPVectorizer/insertelement-postpone.ll

The following files introduce new uses of undef:

  • llvm/test/Transforms/SLPVectorizer/extracts-with-undefs.ll

Undef is now deprecated and should only be used in the rare cases where no replacement is possible. For example, a load of uninitialized memory yields undef. You should use poison values for placeholders instead.

In tests, avoid using undef and having tests that trigger undefined behavior. If you need an operand with some unimportant value, you can add a new argument to the function and use that instead.

For example, this is considered a bad practice:

define void @fn() {
  ...
  br i1 undef, ...
}

Please use the following instead:

define void @fn(i1 %cond) {
  ...
  br i1 %cond, ...
}

Please refer to the Undefined Behavior Manual for more information.

davemgreen requested a review from fhahn, July 17, 2025
alexey-bataev (Member Author):

Ping!

Inline comment on these lines of canConvertToFMA:

                 isa<Instruction>(std::get<1>(P));
        });
    Type *Ty = VL.front()->getType();
    IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
Collaborator:

Should you be handling both Intrinsic::fma and Intrinsic::fmuladd?

alexey-bataev (Member Author):

I thought that fmuladd is correct here. If TTI sees that it can fold the instructions into an fma, it will return the same cost as fma. Otherwise, it will return the sum of the original fadd+fmul costs, which allows the costs to be compared correctly.
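
For reference, a minimal sketch of that cost query, using the same calls as canConvertToFMA in the diff above (Ty stands for the scalar FP type):

// Ask the target for the throughput cost of an llvm.fmuladd call on Ty.
// Targets that can fold fmul+fadd report the fma cost here; others report
// roughly the cost of the separate fadd+fmul pair.
FastMathFlags FMF;
FMF.set(); // start from all fast-math flags; the patch intersects per value
IntrinsicCostAttributes ICA(Intrinsic::fmuladd, Ty, {Ty, Ty, Ty}, FMF);
InstructionCost FMACost =
    TTI.getIntrinsicInstrCost(ICA, TTI::TCK_RecipThroughput);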

davemgreen (Collaborator):

Does anyone know why we do not generally convert fast fmul+fadd to fmuladd, to make cost-modelling easier? It would prevent us from doing things like inserting a shuffle between a fmul and an fadd, but maybe it prevents reductions in some cases? It might make cost-modelling in general simpler for any target with fma.

alexey-bataev (Member Author):

> Does anyone know why we do not generally convert fast fmul+fadd to fmuladd, to make cost-modelling easier? It would prevent us from doing things like inserting a shuffle between a fmul and an fadd, but maybe it prevents reductions in some cases? It might make cost-modelling in general simpler for any target with fma.

Maybe, conservative canonical representation?

davemgreen (Collaborator):

> Maybe, conservative canonical representation?

Yeah, it would be a much bigger change. Our costs are a bit odd, I think (fmul is more expensive than fmuladd). I'll look into that, but this patch sounds sensible to me.

alexey-bataev (Member Author):

Ping!

RKSimon (Collaborator) commented Aug 5, 2025

Shouldn't the title be "[SLP]Initial FMAD support"?

alexey-bataev changed the title from [SLP]Initial FMA support to [SLP]Initial FMAD support, Aug 6, 2025
alexey-bataev (Member Author):

> Shouldn't the title be "[SLP]Initial FMAD support"?

Done

RKSimon (Collaborator) left a comment:

A few minors

RKSimon (Collaborator) left a comment:

LGTM

alexey-bataev merged commit 0bcf45e into main on Aug 7, 2025 (9 of 10 checks passed).
alexey-bataev deleted the users/alexey-bataev/spr/slpinitial-fma-support branch on August 7, 2025.
eugeneepshteyn (Contributor):

@alexey-bataev, could you please take a look at #152683? That issue may be related to your change, although we haven't yet done a complete bisect.

alexey-bataev (Member Author):

> @alexey-bataev, could you please take a look at #152683? That issue may be related to your change, although we haven't yet done a complete bisect.

Yes, it looks like it was triggered by this patch. Going to revert it and land it again after fixing the issue.

ronlieb (Contributor) commented Aug 8, 2025

Our nightly CI noticed that this patch breaks the build of SPEC CPU 2017 510, 511, and 526 and OMP2012 362; reverting it locally seems to resolve the failures.

gregbedwell (Collaborator):

As I'm assuming that it's the same bug, I've added another testcase (https://godbolt.org/z/1ThE7nf6v) to #152683. This was reduced from an assertion failure building a linear algebra library using clang.

alexey-bataev (Member Author):

> As I'm assuming that it's the same bug, I've added another testcase (https://godbolt.org/z/1ThE7nf6v) to #152683. This was reduced from an assertion failure building a linear algebra library using clang.

Thanks, it helps a lot

davemgreen (Collaborator):

A note: we saw some fallout from this in internal performance testing too. Something like this example, doing a reduction of an fmul under fast-math, is no longer vectorized by the SLP vectorizer: https://godbolt.org/z/rYWM7dxEj. On AArch64 it wasn't helped by a different set of cost calculations that mark an fmul the same cost as an fma, but that example is x86. The original fadds in a reduction can be combined into an fma, but the expanded reduction will still become an fma for part of it.
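
A minimal sketch of the affected shape, in hypothetical IR (the godbolt link above has the actual reproducer): a fast-math fadd reduction over fmul results, whose root now looks like an FMA candidate, so matchAssociativeReduction bails out:

define float @dot4(ptr %a, ptr %b) {
  ; Four products accumulated with fadds; previously the SLP vectorizer
  ; turned this into a <4 x float> fmul plus a vector fadd reduction.
  %a0 = load float, ptr %a, align 4
  %b0 = load float, ptr %b, align 4
  %pa1 = getelementptr inbounds float, ptr %a, i64 1
  %pb1 = getelementptr inbounds float, ptr %b, i64 1
  %a1 = load float, ptr %pa1, align 4
  %b1 = load float, ptr %pb1, align 4
  %pa2 = getelementptr inbounds float, ptr %a, i64 2
  %pb2 = getelementptr inbounds float, ptr %b, i64 2
  %a2 = load float, ptr %pa2, align 4
  %b2 = load float, ptr %pb2, align 4
  %pa3 = getelementptr inbounds float, ptr %a, i64 3
  %pb3 = getelementptr inbounds float, ptr %b, i64 3
  %a3 = load float, ptr %pa3, align 4
  %b3 = load float, ptr %pb3, align 4
  %m0 = fmul fast float %a0, %b0
  %m1 = fmul fast float %a1, %b1
  %m2 = fmul fast float %a2, %b2
  %m3 = fmul fast float %a3, %b3
  %s1 = fadd fast float %m0, %m1
  %s2 = fadd fast float %s1, %m2
  %s3 = fadd fast float %s2, %m3
  ret float %s3
}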

alexey-bataev (Member Author):

> A note: we saw some fallout from this in internal performance testing too. Something like this example, doing a reduction of an fmul under fast-math, is no longer vectorized by the SLP vectorizer: https://godbolt.org/z/rYWM7dxEj. On AArch64 it wasn't helped by a different set of cost calculations that mark an fmul the same cost as an fma, but that example is x86. The original fadds in a reduction can be combined into an fma, but the expanded reduction will still become an fma for part of it.

There should be a follow-up patch to support FMA-based reductions.

alexey-bataev added a commit that referenced this pull request Aug 8, 2025
Added initial check for potential fmad conversion in reductions and
operand vectorization.

Added the check for the instruction to fix #152683.
davemgreen added a commit that referenced this pull request Aug 10, 2025
This reverts commit 0fffb9f due to major
performance regressions.
davemgreen (Collaborator):

> > A note: we saw some fallout from this in internal performance testing too. Something like this example, doing a reduction of an fmul under fast-math, is no longer vectorized by the SLP vectorizer: https://godbolt.org/z/rYWM7dxEj. On AArch64 it wasn't helped by a different set of cost calculations that mark an fmul the same cost as an fma, but that example is x86. The original fadds in a reduction can be combined into an fma, but the expanded reduction will still become an fma for part of it.

> There should be a follow-up patch to support FMA-based reductions.

Sounds great. The performance regressions were pretty large, and people here wouldn't be happy with something so large being broken for any length of time. It's a multiply-accumulate, after all; they come up everywhere. I've added some phase-ordering tests in a976843.

alexey-bataev (Member Author) commented Aug 10, 2025

> > > A note: we saw some fallout from this in internal performance testing too. Something like this example, doing a reduction of an fmul under fast-math, is no longer vectorized by the SLP vectorizer: https://godbolt.org/z/rYWM7dxEj. On AArch64 it wasn't helped by a different set of cost calculations that mark an fmul the same cost as an fma, but that example is x86. The original fadds in a reduction can be combined into an fma, but the expanded reduction will still become an fma for part of it.

> > There should be a follow-up patch to support FMA-based reductions.

> Sounds great. The performance regressions were pretty large, and people here wouldn't be happy with something so large being broken for any length of time. It's a multiply-accumulate, after all; they come up everywhere. I've added some phase-ordering tests in a976843.

https://github.com/llvm/llvm-project/pull/152787/files/611246c7328ac226ac3773eadf57f92c7ef648cc..5bbf933ff0236d13b06b13f2d0e0c4289f64111b

alexey-bataev added a commit that referenced this pull request Aug 11, 2025
Added initial check for potential fmad conversion in reductions and
operand vectorization.

Added the check for the instruction to fix #152683.

Skipped the code for reductions to avoid regressions.